#install.packages("readstata13")
library(tidyverse)
library(readstata13)
library(stargazer)
library(dplyr)
library(ggplot2)
library(dplyr)
setwd("C:/Sinjae Kang/study hard/작업 완료 논문들/(2025.04)대선토론(with 지영, 김정현 교수님)/Harvard Daterverse")
eai <- read.dta13("P2022_stateclass.dta")

eai <- eai %>% mutate(num.watch = case_when(Q12 == 6 ~ 0,
                                            
                                            Q12 == 1 ~ 1,
                                            
                                            Q12 == 2 ~ 2,
                                            
                                            Q12 == 3 ~ 3,
                                            
                                            Q12 == 4 ~ 4,
                                            
                                            Q12 == 5 ~ 5))


eai <- eai %>% mutate(policy.based = case_when(Q4_1 == 1 |  Q4_1 == 3 | Q4_1 == 7 | Q4_1 == 8 | Q4_1 == 9 ~ 1,
                                               
                                               Q4_1 == 2 | Q4_1 == 4| Q4_1 == 5 |Q4_1 == 6 | Q4_1 == 10 | Q4_1 == 11   ~ 0))

table(eai$policy.based) # 1=458 , 0=584

eai <- eai %>% mutate(info.source = case_when(Q10 == 1 | Q10 == 2 ~ "TV",
                                              
                                              Q10 == 3 ~ "newspaper",
                                              Q10 == 4 ~ "radio",
                                              Q10 == 5 ~ "newspaper",
                                              Q10 == 6 | Q10 == 7 ~ "social media",
                                              Q10 == 8 ~ "podcast",
                                              Q10 == 9 | Q10 == 10 | Q10 == 11 | Q10 == 12 ~ "other"))
eai <- eai %>% mutate(info.source = as.factor(info.source),
                      info.source = fct_relevel(info.source, "other"))

eai <- eai %>%  ## Added for R&R
  mutate(info.source_newspaper = if_else(info.source == "newspaper", 1, 0))

eai <- eai %>% mutate(gov.trust = case_when(Q17 ==1 ~ 4,
                                            Q17 == 2 ~ 3,
                                            Q17 == 3 ~ 2,
                                            Q17 == 4 ~ 1))

eai <- eai %>% mutate(female = case_when(SQ2_T1 == 1 ~ 0, 
                                         SQ2_T1 == 2 ~ 1),
                      voted = case_when(Q1 == 1 ~ 1,
                                        Q1 == 2 ~ 1,
                                        Q1 == 3 ~ 0,
                                        Q1 == 4 ~ 0))

# Change minds
table(eai$Q14)
eai <- eai %>% mutate(consistent = case_when(Q14 == 1 | Q14 == 2 | Q14 == 3 ~ 1,
                                             Q14 == 4 | Q14 == 5 ~ 0))

# Levels of trust in election
table(eai$Q7)

# This presidential election is a referendum on the Moon Jae-in administration (x)
table(eai$Q6_1)

# Policy competition between candidates has intensified compared to the last presidential election
table(eai$Q6_4)

# Democratic satisfaction
table(eai$Q26)

eai <- eai %>% # Exclude 99 = missing values
  mutate(
    Q6_1 = ifelse(Q6_1 == 99, NA, Q6_1),
    policy_competition = ifelse(Q6_4 == 99, NA, Q6_4),
    demo_satisfaction = ifelse(Q26 == 99, NA, Q26)
  )

table(eai$demo_satisfaction)


# Subsample
yoon_data <- eai %>%
  filter(Yoon==1)

lee_data <- eai %>%
  filter(Lee==1)

#### Descriptive Statistics #### 08.18
library(dplyr)
library(psych)

# =============================================================================
# Extract only samples used in regression analysis (remove missing values)
# =============================================================================
# Select only variables used in regression model and remove missing values
analysis_data <- eai %>%
  dplyr::select(policy.based, num.watch, info.source, AGE_T1, Income, Edu, female,
                VoterIdeo, gov.trust, Yoon, demo_satisfaction, policy_competition) %>%
  filter(complete.cases(.))

cat("Full data N:", nrow(eai), "\n")
cat("Regression analysis data N:", nrow(analysis_data), "\n\n")

# =============================================================================
# Convert info.source to dummy variables (include all categories)
# =============================================================================
analysis_data <- analysis_data %>%
  mutate(
    info_other = ifelse(info.source == "other", 1, 0),
    info_newspaper = ifelse(info.source == "newspaper", 1, 0),
    info_podcast = ifelse(info.source == "podcast", 1, 0),
    info_radio = ifelse(info.source == "radio", 1, 0),
    info_social = ifelse(info.source == "social media", 1, 0),
    info_tv = ifelse(info.source == "TV", 1, 0)
  )

# =============================================================================
# Summary for publication table
# =============================================================================
cat("\n=== Summary for Publication Table ===\n")

# Continuous variable summary (Mean, SD, Min, Max, N)
continuous_summary <- analysis_data %>% 
  dplyr::select(policy.based, info_newspaper, info_podcast, info_radio, 
                info_social, info_tv, info_other, num.watch, AGE_T1, Income, 
                Edu, female, VoterIdeo, gov.trust, Yoon, demo_satisfaction, 
                policy_competition) %>%
  summarise_all(list(
    Mean = ~mean(.x, na.rm = TRUE),
    SD = ~sd(.x, na.rm = TRUE),
    Min = ~min(.x, na.rm = TRUE),
    Max = ~max(.x, na.rm = TRUE),
    N = ~sum(!is.na(.x))
  )) %>%
  gather(key, value) %>%
  separate(key, into = c("Variable", "Statistic"), sep = "_(?=[^_]+$)", extra = "merge") %>%
  spread(Statistic, value) %>%
  dplyr::select(Variable, N, Mean, SD, Min, Max)

print(continuous_summary)

#### R&R code
# Check not included control variable model

mod1_rr <- glm(policy.based ~    info.source , 
               data = eai, family = "binomial")

stargazer(mod1_rr, type='text') ## reference = others, newspaper is significant(p<0.1)

mod2_rr <- glm(policy.based ~    info.source_newspaper , 
               data = eai, family = "binomial")

stargazer(mod2_rr, type='text') ## newspaper dummy is not significant


eai <- eai %>% mutate(info.source2 = as.factor(info.source),
                      info.source2 = fct_relevel(info.source, "TV"))

mod3_rr <- glm(policy.based ~   info.source2 , 
               data = eai, family = "binomial")

stargazer(mod3_rr, type='text') ## reference = TV, newspaper is not significant

#### Subsample
eai <- eai %>% 
  mutate(IdeologyGroup = case_when(
    (VoterIdeo >= 0 & VoterIdeo <= 2) | (VoterIdeo >= 8 & VoterIdeo <= 10) ~ "Extreme",
    VoterIdeo >= 3 & VoterIdeo <= 7 ~ "Moderate",
    TRUE ~ NA_character_
  ))
# Check distribution of each ideology group
table(eai$IdeologyGroup, useNA = "ifany") # Extreme = 272 / Moderate = 815


# Keep original classification for more detailed analysis
eai <- eai %>%
  mutate(DetailedIdeology = case_when(
    VoterIdeo >= 0 & VoterIdeo <= 2 ~ "Progressive",
    VoterIdeo >= 3 & VoterIdeo <= 7 ~ "Moderate",
    VoterIdeo >= 8 & VoterIdeo <= 10 ~ "Conservative",
    TRUE ~ NA_character_
  ))

# Conduct separate analyses by group
eai_extreme <- eai %>% filter(IdeologyGroup == "Extreme")
eai_moderate <- eai %>% filter(IdeologyGroup == "Moderate")

mod1 <- glm(policy.based ~  num.watch + info.source + AGE_T1 + Income + Edu + female +
              VoterIdeo + gov.trust + Yoon + demo_satisfaction + policy_competition, 
            data = eai, family = "binomial")

stargazer(mod1, type='text')


#### New
# Generate dataframe including all necessary variables
pred_df_all <- data.frame(
  info.source = unique(eai$info.source[!is.na(eai$info.source)]),  # Exclude NA
  num.watch = mean(eai$num.watch, na.rm = TRUE),
  AGE_T1 = mean(eai$AGE_T1, na.rm = TRUE),
  Income = mean(eai$Income, na.rm = TRUE),
  Edu = mean(eai$Edu, na.rm = TRUE),
  female = mean(eai$female, na.rm = TRUE),
  VoterIdeo = mean(eai$VoterIdeo, na.rm = TRUE),
  gov.trust = mean(eai$gov.trust, na.rm = TRUE),
  Yoon = mean(eai$Yoon, na.rm = TRUE),
  demo_satisfaction = mean(eai$demo_satisfaction, na.rm = TRUE),  # Added!
  policy_competition = mean(eai$policy_competition, na.rm = TRUE)  # Added!
)

# Check dataframe
print("Prediction Dataset:")
print(pred_df_all)

# Calculate predictions
predictions <- predict(mod1, newdata = pred_df_all, type = "response", se.fit = TRUE)

# Generate results dataframe
results_df <- data.frame(
  info.source = pred_df_all$info.source,
  pred = predictions$fit,
  se = predictions$se.fit
) %>%
  mutate(
    lower = pred - (1.96 * se),
    upper = pred + (1.96 * se)
  )

print(results_df)

# Visualization
ggplot(results_df, aes(x = info.source, y = pred)) +
  geom_point(size = 3, color = "darkblue") +
  geom_errorbar(aes(ymin = lower, ymax = upper), 
                width = 0.2, 
                color = "darkblue",
                size = 1) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") +
  labs(x = "Information Source",
       y = "Predicted Probability of Policy-Based Voting",
       title = "Marginal Effects of Information Sources on Policy-Based Voting") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )

# Save graph
ggsave("info_source_marginal_effects_FINAL.png", width = 10, height = 7, dpi = 300)

# Print numerical results
print("\nInformation Source Marginal Effects (with 95% CI):")
print(results_df %>% 
        mutate(across(where(is.numeric), ~round(., 3))) %>%
        arrange(desc(pred)))



model_extreme <- glm(policy.based ~  num.watch + info.source + AGE_T1 + Income + Edu + female +
                       gov.trust + Yoon  + demo_satisfaction + policy_competition, 
                     data = eai_extreme, family = "binomial")

stargazer(model_extreme, type='text')

model_moderate <- glm(policy.based ~  num.watch + info.source + AGE_T1 + Income + Edu + female +
                        gov.trust + Yoon + demo_satisfaction + policy_competition , 
                      data = eai_moderate, family = "binomial")

stargazer(model_moderate, type='text') # Significant only in moderate model


#### Partisan vs nonpartisan
# Generate partisan subsample for cases where Yoon==1 or Lee==1
eai_partisan <- eai %>%
  filter(Yoon == 1 | Lee == 1)
eai_nonpartisan <- eai %>%
  filter(Yoon == 0 & Lee == 0)
nrow(eai_partisan) # 1021
nrow(eai_nonpartisan) # 1021

mod1 <- glm(policy.based ~  num.watch + info.source + AGE_T1 + Income + Edu + female +
              VoterIdeo + gov.trust + Yoon +  demo_satisfaction + policy_competition , 
            data = eai, family = "binomial")

stargazer(mod1, type='text')


mod1_partisan <- glm(policy.based ~  num.watch + info.source + AGE_T1 + Income + Edu + female +
                       VoterIdeo + gov.trust + Yoon +  demo_satisfaction + policy_competition , 
                     data = eai_partisan, family = "binomial")

stargazer(mod1_partisan, type='text') # Not significant

mod1_nonpartisan <- glm(policy.based ~  num.watch + info.source + AGE_T1 + Income + Edu + female +
                          VoterIdeo + gov.trust + Yoon  + demo_satisfaction + policy_competition , 
                        data = eai_nonpartisan, family = "binomial")

stargazer(mod1_nonpartisan, type='text')  # Too small sample size 



#### Appendix report
stargazer(model_extreme, model_moderate, mod1_partisan, 
          type = 'text',
          title = "Regression Results",
          column.labels = c("Extreme Model", "Moderate Model", "Partisan Model"),
          model.numbers = FALSE)
